SURVDAT pull comparison

Digging into differences in the fall Bigelow survey data across different pulls of survdat

Objective is to isolate which main columns are inconsistent across the datasets and whether the problem is isolated to specific species.

Load SURVDAT Sources

# 2016
load(paste0(mills_path, "Projects/WARMEM/Old survey data/Survdat_Nye2016.RData"))
survdat_16 <- clean_names(survdat) %>% filter(est_year >= 2000)


# 2019
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_allseason.RData"))
survdat_19 <- clean_names(survdat) %>% filter(est_year >= 2000)

# 2020
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020.RData"))
survdat_20 <- clean_names(survdat) %>% filter(est_year >=  2000)

# Double check of 2020 from Janet on 01282021
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020_check01282021.RData"))
survdat_20_check <- clean_names(survdat) %>% filter(est_year >=  2000)

# 2021
load(paste0(res_path, "NMFS_trawl/survdat_slucey_01152021.RData"))
survdat_21 <- clean_names(survdat) %>% filter(year >= 2000)

# # Julek / Paul Kostovik
# survdat_julek <- readxl::read_xls(paste0(res_path, "NMFS_trawl/GMRI_cod_haddock_yt_1970-2016station_catch_length.xls")) %>%
#   clean_names()
# 
# # this file is somewhere between the rest on names so a few renames are in order
# # names(survdat_julek)
# survdat_julek <- survdat_julek %>%
#   rename(cruise6     = cruise,
#          svvessel    = vessel,
#          lat         = decdeg_beglat,
#          lon         = decdeg_beglon,
#          depth       = avg_depth,
#          surftemp    = suface_temperature,
#          surfsalin   = surface_salininty,
#          bottemp     = bottom_temperature,
#          botsalin    = bottom_salinity,
#          est_towdate = begin_gmt_towdate,
#          biomass     = expanded_catch_wt,
#          abundance   = expanded_catch_number,
#          comname     = logged_species_name,
#          numlen      = expanded_number_at_length)

# remove survdat
rm(survdat)

Run Cleanup Function

The cleanup function does a number of steps to each survdat source. This includes the removal of specific strata, the removal of non-spring/fall seasons, removal of vessels other than the Albatross or Bigelow, the dropping of NA biomass and abundance events, and the removal of certain species like shrimp.

It also matches up the stations with larger regions they fall within and the survey effort within each of those. This should not have an impact on any of the things we are tracking below.

The code for the cleanup function can be found here

# Cleanup functions
source(here("R/01_nefsc_ss_build.R"))

# Clean each up using the survdat prep function
trawldat_16 <- survdat_prep(survdat = survdat_16) %>% 
  mutate(survdat_source = "2016")
trawldat_19 <- survdat_prep(survdat = survdat_19) %>% 
  mutate(survdat_source = "2019")
trawldat_20 <- survdat_prep(survdat = survdat_20) %>% 
  mutate(survdat_source = "2020")
trawldat_20_check <- survdat_prep(survdat = survdat_20_check) %>% 
  mutate(survdat_source = "2020_check")
trawldat_21 <- survdat_prep(survdat = survdat_21) %>% 
  mutate(survdat_source = "2021")

# # try the julek dataset, can't pass it through build without sex column
# trawldat_julek <- survdat_prep(survdat = survdat_julek) %>%
#   mutate(survdat_source = "julek")


rm(survdat_16, survdat_19, survdat_20, survdat_20_check, survdat_21)

Conversion Factor Comparisons

The “conversion factor” is a number representing the scale of mismatch between the abundance of a species caught at in a tow when measured using the survdat$abundance column compared to the sum of survdat$numlen across all the lengths caught.

In theory they should be equal. Numbers greater than 1 indicate that the abundance column was greater than the sum of individuals at the different lengths.

# This is the same code used to make numlen_adj in the build:
get_convers <- function(trawldat){
  abundance_check <- trawldat %>%
    group_by(id, comname, catchsex, abundance) %>%
    summarise(
      numlen_abund = sum(numlen),                
      n_len_class  = n_distinct(length), # number of distinct lengths caught that station
      .groups      = "keep") %>% 
    ungroup()
  
  # Conversion factor translates each size at length to what it would need be to be
  # for them to add up to that "abundance" column
  conv_factor <- trawldat %>% 
    distinct(id, comname, catchsex, length, abundance) %>% 
    inner_join(abundance_check, by = c("id", "comname", "catchsex", "abundance")) %>% 
    mutate(convers = abundance / numlen_abund)
    
  
  
  # Merge back and convert the numlen field
  trawldat_adjusted <- trawldat %>%
    left_join(conv_factor, by = c("id", "comname", "catchsex", "length", "abundance", "n_len_class")) %>%
    mutate(numlen_adj = numlen * convers, .after = numlen) 
  
  return(trawldat_adjusted)
}



# Put all the groups together to compare conversion factors
trawldat_list <- list(
    "2016" = get_convers(trawldat_16),
    "2019" = get_convers(trawldat_19),
    "2020" = get_convers(trawldat_20),
    "2020 re-check" = get_convers(trawldat_20_check),
    "2021" = get_convers(trawldat_21))

trawldat_combined <- bind_rows(
  trawldat_list, 
  .id = "survdat_source"
)

All Species

trawldat_combined %>% 
  mutate(season = fct_rev(season)) %>% 
  ggplot(aes(y = convers, 
             x  = survdat_source, 
             color = survdat_source)) +
  geom_boxplot() +
  facet_grid(season ~ svvessel) + 
  scale_color_gmri() +
  labs(subtitle = "numlen to abundance conversion value - All Species",
       y = "(survdat$abundance / sum(numlen))",
       x = "",
       caption = "Conversion value calculated at each station for each species & sex.\nMay reflect change in catch composition.")

Haddock Only

trawldat_combined %>% 
  filter(comname == "haddock") %>% 
  mutate(season = fct_rev(season)) %>% 
  ggplot(aes(y = convers, 
             x  = survdat_source, 
             color = survdat_source)) +
  geom_boxplot() +
  facet_grid(season ~ svvessel) + 
  scale_color_gmri() +
  labs(subtitle = "numlen to abundance conversion value - Haddock only",
       y = "(survdat$abundance / sum(numlen))",
       x = "",
       caption = "Conversion value calculated at each station/sex for haddock.")

Single Species Checks

For each of the following species we are comparing 4 main things across the different survdat sources:

  1. The number of unique station id’s each year
  2. The total number of fish, as tallied from the survdat$numlen column
  3. The total abundance of fish as the sum of survdat$abundance from distinct stations and the species caught at them
  4. The total biomass for each year as the sum of survdat$biomass from distinct stations and the species caught at them

NOTE: The Below code chunk details the functions used to isolate a given species and pull its information for display. Subsequent tabs will detail the differences among the different survdat pulls.

# Funciton to pull single species data
species_select <- function(species_choice){
  
  # Combine data from each source for plotting
  # source_list <- list(trawldat_16, trawldat_19, 
  #                     trawldat_20, trawldat_21)
  
  source_list <- trawldat_list
  
  # Use map to not repeat so much code
  select_species <- map_dfr(source_list, function(surv_source){
    surv_source %>% 
      select(survdat_source, id, est_year, season, comname, length, 
             numlen, abundance, biom_adj) %>% 
      filter(comname == species_choice)
  })
  
  return(select_species)
}

# Plotting Numlen Totals and Station counts
plot_stations <- function(combined_data, species_choice){
  
  # Total numlen fish
  species_numlens <- combined_data %>% 
    group_by(survdat_source, est_year, season) %>% 
    summarise(n_stations = n_distinct(id),
              sum_numlen = sum(numlen, na.rm = T),
              .groups = "keep")  %>%  
    ungroup() %>% 
    mutate(season = fct_rev(season))
  
  
  
  station_plot <- ggplot(species_numlens, 
                         aes(est_year, n_stations, color = survdat_source)) +
    geom_line() +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "total stations")
  
  numlen_plot <- ggplot(species_numlens, 
                        aes(est_year, sum_numlen, color = survdat_source)) +
    geom_line() +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "total numlen abundance", 
         caption = paste0(species_choice, " only"))
  
  plot_out <- station_plot / numlen_plot
  return(plot_out)
}

# Plotting Biomass and Abundances
plot_biomass <- function(combined_data, species_choice){
  # Abundance and Biomass
  species_abund_bio <- combined_data %>% 
    distinct(survdat_source, est_year, season, id, abundance, biom_adj) %>% 
    group_by(survdat_source, est_year, season) %>% 
    summarise(n_stations = n_distinct(id),
              abundance = sum(abundance, na.rm = T),
              biomass = sum(biom_adj, na.rm = T),
              .groups = "keep") %>%  
    ungroup() %>% 
    mutate(season = fct_rev(season))
  
  abund_plot <- ggplot(species_abund_bio, 
                       aes(est_year, abundance, color = survdat_source)) +
    geom_line() +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total survdat$abundance")
  
  
  bio_plot <- ggplot(species_abund_bio, aes(est_year, biomass, color = survdat_source)) +
    geom_line() +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total survdat$biomass", 
         caption = paste0(species_choice, " only"))
  
  plot_out <- abund_plot / bio_plot
  return(plot_out)
  
}

Haddock

Stations and Numlen Totals

species_choice <- "haddock"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Spiny dogfish

Stations and Numlen Totals

species_choice <- "spiny dogfish"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Atlantic Cod

Stations and Numlen Totals

species_choice <- "atlantic cod"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Atlantic Herring

Stations and Numlen Totals

species_choice <- "atlantic herring"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Dataset Contents

This section will detail how each data set compares along a handful of common metrics. How many stations there are, how much biomass there is, which strata are present etc.

These should all be about the same, but may point us in the direction of what might be causing these differences.

trawldat_combined %>% 
  group_by(survdat_source) %>% 
  summarise(
    n_years = n_distinct(est_year),
    n_strata = n_distinct(stratum),
    n_species = n_distinct(comname))%>% 
  kable() %>% 
  kable_styling()
survdat_source n_years n_strata n_species
2016 16 53 323
2019 19 53 327
2020 20 53 333
2020 re-check 20 53 333
2021 20 53 325

Summary

Datasets seem to fall into 3 distinct groups when looking at abundance:

1. Pre-2020

2. 2020 Survdat Nye Data

3. 2021 Survdat Pull

And two different groups when looking at biomass:

1. 2021 Survdat Pull

2. Everything else

species_choice <- "atlantic cod"
single_species <- species_select(species_choice) 



cod_sources <- single_species %>% 
  mutate(`survdat group` = case_when(
    survdat_source %in% c("2016", "2019") ~ "Pre-2020 Data",
    str_detect(survdat_source, "2020") ~ "2020 Data",
    survdat_source == "2021" ~ "Sean's 2021 Data"),
    `survdat group` = factor(`survdat group`, 
                             levels = c("Pre-2020 Data", "2020 Data", "Sean's 2021 Data"))) %>% 
  distinct(`survdat group`, survdat_source, est_year, season, id, abundance, biom_adj) %>% 
  group_by(`survdat group`, survdat_source, est_year, season) %>% 
  summarise(n_stations = n_distinct(id),
            abundance = sum(abundance, na.rm = T),
            biomass = sum(biom_adj, na.rm = T),
            .groups = "keep") %>%  
    ungroup() %>% 
    mutate(season = fct_rev(season))
  

#ellipse marker
mark1 <- cod_sources %>% 
  filter(season == "FALL", survdat_source == "2021", est_year == 2009) 

Abundance

# Abundance
cod_sources %>% 
  filter(season == "FALL") %>% 
  ggplot(aes(est_year, abundance)) +
  geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
  geom_jitter(aes(group = survdat_source, color = `survdat group`, shape = survdat_source), 
              width = 0.15) +
  geom_mark_ellipse(data = mark1, 
                    aes(est_year, abundance, label = "Separation of the 3 groups"), 
                    linetype = 2) +
  scale_color_gmri() +
  facet_zoom(xlim = c(2008:2011)) +
  labs(x = "", 
       y = "Total survdat$abundance", 
       title = "Data Mismatches Following Vessel Switch", 
       subtitle = "Inconsistency in Bigelow data over time.", 
       caption = paste0(species_choice, " only"),
       shape = "Pull Source",
       color = "Consistent Grouping") 

Biomass

# Biomass
cod_sources %>% 
  filter(season == "FALL") %>% 
  ggplot(aes(est_year, biomass)) +
  geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
  geom_jitter(aes(group = survdat_source, color = `survdat group`, shape = survdat_source), 
              width = 0.15) +
  geom_mark_ellipse(data = mark1, 
                    aes(est_year, biomass, label = "Separation of the 2 groups"), 
                    linetype = 2) +
  scale_color_gmri() +
  facet_zoom(xlim = c(2007:2013)) +
  labs(x = "", 
       y = "Total survdat$biomass", 
       title = "Data Mismatches Following Vessel Switch", 
       subtitle = "Inconsistency in Bigelow data over time.", 
       caption = paste0(species_choice, " only"),
       shape = "Pull Source",
       color = "Consistent Grouping") 

 

A work by Adam A. Kemberling

Akemberling@gmri.org